In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

Series

pd.Series(data, index, name)


In [2]:
# Build a Series following the pattern pd.Series(data, index=index):
# five standard-normal draws labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s


Out[2]:
a   -0.616345
b   -1.020763
c   -1.874393
d    0.835360
e   -0.330025
dtype: float64

In [3]:
# To get the index
s.index


Out[3]:
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# Index labels do not have to be unique: the label 'a' is used twice here.
duplicate_index = pd.Series(data=np.random.randn(4), index=list('aacd'))
duplicate_index


Out[4]:
a   -1.143204
a   -0.623465
c    0.655221
d    1.632808
dtype: float64

In [5]:
duplicate_index['a']


Out[5]:
a   -1.143204
a   -0.623465
dtype: float64

In [6]:
# A dict can be passed to pd.Series; the keys of the dict become the index of the Series.

# A Series behaves much like an ndarray, so it can be passed to most NumPy functions.
# Slicing also works on a Series.

# Elements can also be fetched by integer position. Use .iloc for that:
# plain s[0] on a label-indexed Series relies on a positional fallback that
# newer pandas versions deprecate.
s.iloc[0]


Out[6]:
-0.61634508374071517

In [7]:
# First three elements by position (the end of a positional slice is exclusive).
s.iloc[:3]


Out[7]:
a   -0.616345
b   -1.020763
c   -1.874393
dtype: float64

In [8]:
s


Out[8]:
a   -0.616345
b   -1.020763
c   -1.874393
d    0.835360
e   -0.330025
dtype: float64

In [9]:
# A Series can also be used like a dict keyed by its index labels,
# so dict-style access by label works on it.

# Assigning to a label that does not exist yet adds a new entry.
s.loc['f'] = 10

In [10]:
s


Out[10]:
a    -0.616345
b    -1.020763
c    -1.874393
d     0.835360
e    -0.330025
f    10.000000
dtype: float64

In [12]:
# The optional `name` argument labels the Series itself. No index is given,
# so the default integer index is used.
s = pd.Series(data=np.random.randn(6), name='Something')
s


Out[12]:
0   -0.086604
1   -0.848126
2    0.600843
3   -0.056080
4   -0.412875
5   -1.192173
Name: Something, dtype: float64

In [13]:
s.name


Out[13]:
'Something'

DataFrame


In [41]:
# Build a DataFrame from a dict of Series. The row index is the union of the
# Series indexes; 'one' has no entry for 'd', so that cell becomes NaN.
df = pd.DataFrame(dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
))
df


Out[41]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0

In [16]:
df.index


Out[16]:
Index(['a', 'b', 'c', 'd'], dtype='object')

In [17]:
df.columns


Out[17]:
Index(['one', 'two'], dtype='object')

In [21]:
# DataFrame does not function in the same way as numpy 2-D array

# Treat DataFrame as a dict of like-indexed Series objects

In [22]:
df['one']


Out[22]:
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [24]:
# Select the column first (which gives a Series), then the row label within it.
# NOTE: df.loc['a', 'one'] is the single-step equivalent of this lookup.
df['one']['a']


Out[24]:
1.0

In [26]:
# Add a derived column: the sum is computed element-wise on the shared index,
# and a NaN in either operand yields NaN (see row 'd' in the output).
df['three'] = df['one'] + df['two']
df


Out[26]:
one two three
a 1.0 1.0 2.0
b 2.0 2.0 4.0
c 3.0 3.0 6.0
d NaN 4.0 NaN

In [28]:
# Add a boolean column from a comparison; the NaN in row 'd' compares as False.
df['flag'] = df['one']  > 2
df


Out[28]:
one two three flag
a 1.0 1.0 2.0 False
b 2.0 2.0 4.0 False
c 3.0 3.0 6.0 True
d NaN 4.0 NaN False

In [29]:
# Columns can be deleted like dict keys; `del` removes the column from df itself.
del df['flag']
df


Out[29]:
one two three
a 1.0 1.0 2.0
b 2.0 2.0 4.0
c 3.0 3.0 6.0
d NaN 4.0 NaN

In [30]:
# pop() removes the column from df and returns it as a Series.
three = df.pop('three')
three


Out[30]:
a    2.0
b    4.0
c    6.0
d    NaN
Name: three, dtype: float64

In [31]:
# You can do almost all dictionary operations on a DataFrame, treating
# the column names as the keys and the complete columns as the values.

# Remember that df['name'] selects the 'name' Series from df.
# That Series can then be treated much like a simple ndarray.
# To access a single element of df['name'], use df['name']['index'].

In [32]:
df


Out[32]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0

In [33]:
# First element of column 'one' by integer position. .iloc makes the
# positional intent explicit; df['one'][0] on a string-labelled Series
# depends on a positional fallback that newer pandas versions deprecate.
df['one'].iloc[0]


Out[33]:
1.0

In [35]:
# assign() returns a new DataFrame carrying an extra column computed
# from a combination of the existing columns.
df.assign(ratio=lambda frame: frame['one'] / frame['two'])


Out[35]:
one two ratio
a 1.0 1.0 1.0
b 2.0 2.0 1.0
c 3.0 3.0 1.0
d NaN 4.0 NaN

In [36]:
# The new column can also hold the result of a comparison (a boolean column).
df.assign(check=df['one'].gt(2))


Out[36]:
one two check
a 1.0 1.0 False
b 2.0 2.0 False
c 3.0 3.0 True
d NaN 4.0 False

In [45]:
# To apply a condition while assigning, filter the rows with query() first:
# the lambda passed to assign() then receives only the rows that were kept.
df.query('two > 2').assign(eg1 = lambda x: x.two)

# The same pattern works whenever a column should be filled only with
# values from rows that meet some required condition.


Out[45]:
one two eg1
c 3.0 3.0 3.0
d NaN 4.0 4.0

In [46]:
# To apply multiple conditions, combine them with & inside the query string.
# Row 'd' is kept in the output because its 'one' value is NaN, and NaN != 3 holds.
# NOTE(review): query() appears to give & the precedence of `and`, so this reads
# as (two > 2) & (one != 3), which matches the output — confirm against the pandas docs.

df.query('two > 2 & one != 3').assign(eg2 = lambda x : x.two)


Out[46]:
one two eg2
d NaN 4.0 4.0

In [ ]:
# Indexing rules

# df['col_name']        -> returns the complete column as a Series
# df.loc['label']       -> returns the entire row with that label as a Series
# df.iloc[int_position] -> returns the entire row at that integer position as a Series
# df[5:10]              -> slices the rows
# df[bool_vec]          -> selects rows by boolean vector

In [50]:
df


Out[50]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0

In [51]:
df.loc['b']


Out[51]:
one    2.0
two    2.0
Name: b, dtype: float64

In [52]:
df.iloc[1]


Out[52]:
one    2.0
two    2.0
Name: b, dtype: float64

In [56]:
# Alignment and arithmetic: two random frames of different shapes
# (10x4 and 7x3) that share the column labels A, B and C.
df1 = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'))
df2 = pd.DataFrame(np.random.randn(7, 3), columns=list('ABC'))

In [57]:
df1


Out[57]:
A B C D
0 0.588785 1.198493 -1.631852 1.474286
1 -0.862017 -0.546645 -0.163934 0.278578
2 -1.357200 1.392881 1.134246 -0.362245
3 0.597219 -1.041849 0.127411 -0.227187
4 1.636223 -1.049857 -0.021015 0.539049
5 0.301174 -1.576095 0.987439 1.429394
6 -0.300917 2.874198 -0.596179 1.625892
7 -0.396302 -0.996899 -0.095010 0.259675
8 1.331846 -0.683059 0.889802 1.492522
9 0.709617 0.145721 2.222681 -1.033920

In [58]:
df2


Out[58]:
A B C
0 1.762065 2.699884 -0.310927
1 -0.100875 -0.508357 1.682844
2 0.595381 0.081955 0.069323
3 -1.445428 0.423870 0.088223
4 -0.923425 0.081247 -1.525817
5 0.098620 2.393215 0.446321
6 0.037939 -0.149339 2.192394

In [59]:
# Adding two DataFrames aligns them on both row labels and column labels.
# Labels present in only one operand (rows 7-9 and column D here) come out as NaN.
df1+df2


Out[59]:
A B C D
0 2.350849 3.898377 -1.942780 NaN
1 -0.962892 -1.055002 1.518910 NaN
2 -0.761819 1.474836 1.203569 NaN
3 -0.848209 -0.617979 0.215635 NaN
4 0.712798 -0.968610 -1.546831 NaN
5 0.399794 0.817121 1.433760 NaN
6 -0.262978 2.724859 1.596215 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN

In [64]:
# When operating between a Series and a DataFrame, pandas aligns the index
# of the Series with the columns of the DataFrame.
# NOTE: the example below is the simpler scalar case — every element of the
# sum is multiplied by 2, and NaN entries stay NaN.

(df1+df2)*2


Out[64]:
A B C D
0 4.701699 7.796754 -3.885559 NaN
1 -1.925784 -2.110004 3.037820 NaN
2 -1.523638 2.949672 2.407138 NaN
3 -1.696418 -1.235958 0.431270 NaN
4 1.425596 -1.937219 -3.093663 NaN
5 0.799588 1.634241 2.867520 NaN
6 -0.525956 5.449719 3.192430 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN

In [70]:
# The dtype argument forces a single element type for the whole DataFrame;
# here the integer inputs are stored as 32-bit floats.
df = pd.DataFrame(dict(a=[1, 0, 1], b=[0, 1, 1]), dtype=np.float32)
df


Out[70]:
a b
0 1.0 0.0
1 0.0 1.0
2 1.0 1.0

In [76]:
df.dtypes


Out[76]:
a    float32
b    float32
dtype: object

In [77]:
df.T


Out[77]:
0 1 2
a 1.0 0.0 1.0
b 0.0 1.0 1.0

In [81]:
# NumPy's arithmetic functions can be applied directly to a DataFrame,
# provided that all of its elements are numeric.
df = pd.DataFrame({'one': [np.nan, 1, 2]})  # np.nan: the np.NaN alias was removed in NumPy 2.0
print(df)

# NaN entries do not raise an error: the function is applied element-wise
# and each NaN input simply yields NaN in the result.
# (The original note attributes this to the "v.22" release — TODO confirm
# which release that refers to.)
np.log(df)


   one
0  NaN
1  1.0
2  2.0
Out[81]:
one
0 NaN
1 0.000000
2 0.693147

Console Display


In [87]:
# Rebuild the small example frame; calling info() on it (next cell) prints a
# summary of its index, columns, non-null counts, dtypes and memory usage.
df = pd.DataFrame(dict(
    one=pd.Series([1.0, 2.0, 3.0], index=list('abc')),
    two=pd.Series([1.0, 2.0, 3.0, 4.0], index=list('abcd')),
))
df


Out[87]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0

In [88]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
one    3 non-null float64
two    4 non-null float64
dtypes: float64(2)
memory usage: 96.0+ bytes

In [ ]:
# Total character width available when printing to the console.
pd.options.display.width = 40

# Maximum character width of any individual column.
pd.options.display.max_colwidth = 30

Panel


In [89]:
# For making 3-D arrays 
# Axis names
# 1) items -> axis =0
# 2) major_axis -> axis = 1(index)
# 3) minor_axis -> axis = 2(columns)

In [90]:
# Build a Panel from a (2, 5, 4) array: two items, five daily dates starting
# 2018-09-01 on the major axis, and the labels A-D on the minor axis.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so
# this cell only runs on an older pandas. A MultiIndex DataFrame or xarray is
# the usual replacement — confirm the pandas version this notebook targets.
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], 
             major_axis = pd.date_range('9/1/2018', periods = 5),
             minor_axis = ['A', 'B', 'C', 'D'])

wp


Out[90]:
<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 5 (major_axis) x 4 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 2018-09-01 00:00:00 to 2018-09-05 00:00:00
Minor_axis axis: A to D

In [92]:
# Creating a Panel from a dict of DataFrames: the dict keys become the items.
# The two frames have 3 and 2 columns; the output shows a minor axis of 0 to 2,
# so the narrower frame is presumably padded with NaN — verify if it matters.
# NOTE(review): pd.Panel was removed in pandas 0.25, so this cell needs an
# older pandas — confirm the pandas version this notebook targets.
data = {'Item1':pd.DataFrame(np.random.randn(4,3)),
              'Item2':pd.DataFrame(np.random.randn(4,2))}
wp = pd.Panel(data)

wp


Out[92]:
<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 3 (minor_axis)
Items axis: Item1 to Item2
Major_axis axis: 0 to 3
Minor_axis axis: 0 to 2